{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Preprocessing Step\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! pip install -U pip\n",
    "\n",
    "# If you don't have ClearML installed then uncomment this line\n",
    "# ! pip install -U clearml==0.16.2rc0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "! pip install -U pandas==1.0.4\n",
    "! pip install -U numpy==1.18.4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from collections import Counter\n",
    "\n",
    "from clearml import Task"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configure Task\n",
    "Instantiate a ClearML Task using `Task.init`. \n",
    "\n",
    "A configuration dictionary is connected to the task using `Task.connect`. This enables the [pipeline controller](https://github.com/allegroai/clearml/blob/master/examples/frameworks/pytorch/notebooks/table/tabular_ml_pipeline.ipynb) to access this task's configuration and override its values when the pipeline is executed. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "task = Task.init(\n",
    "    project_name=\"Tabular Example\",\n",
    "    task_name=\"tabular preprocessing\",\n",
    ")\n",
    "logger = task.get_logger()\n",
    "\n",
    "# Default settings; connecting them to the task enables configuration override by clearml\n",
    "configuration_dict = task.connect(\n",
    "    dict(\n",
    "        data_task_id=\"39fbf86fc4a341359ac6df4aa70ff91b\",\n",
    "        fill_categorical_NA=True,\n",
    "        fill_numerical_NA=True,\n",
    "    )\n",
    ")\n",
    "\n",
    "# Show the configuration actually in effect (after override in remote mode)\n",
    "print(configuration_dict)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Get Data\n",
    "\n",
    "ClearML retrieves the data that will be processed. First, the data task is fetched using `Task.get_task` with the task's ID taken from the configuration dictionary. Then the data task's artifacts are accessed in order to retrieve the training and validation sets. \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_task = Task.get_task(configuration_dict.get(\"data_task_id\"))\n",
    "\n",
    "# Load both splits from the data task's artifacts, discarding the \"Unnamed: 0\" column\n",
    "train_set, val_set = (\n",
    "    data_task.artifacts[artifact_name].get().drop(columns=[\"Unnamed: 0\"])\n",
    "    for artifact_name in (\"train_data\", \"val_data\")\n",
    ")\n",
    "\n",
    "logger.report_table(\n",
    "    title=\"Trainset - raw\",\n",
    "    series=\"pandas DataFrame\",\n",
    "    iteration=0,\n",
    "    table_plot=train_set.head(),\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Preprocess Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove hour and year from DateTime data\n",
    "def change_time_format(data_frame):\n",
    "    timestamp = pd.to_datetime(data_frame[\"DateTime\"])\n",
    "    months = [d.month for d in timestamp]\n",
    "    data_frame[\"Month\"] = pd.DataFrame(months).astype(\"object\")\n",
    "    data_frame.drop(columns=[\"DateTime\"], inplace=True)\n",
    "    return data_frame\n",
    "\n",
    "\n",
    "train_set = change_time_format(train_set)\n",
    "val_set = change_time_format(val_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def change_age_format(data_frame):\n",
    "    age = data_frame[\"AgeuponOutcome\"]\n",
    "    months_age = []\n",
    "    for val in age:\n",
    "        if pd.isnull(val):\n",
    "            months_age.append(val)\n",
    "        else:\n",
    "            amount, time_type = val.split(\" \")\n",
    "            if \"day\" in time_type:\n",
    "                mult = 1.0 / 30\n",
    "            if \"week\" in time_type:\n",
    "                mult = 1.0 / 4\n",
    "            if \"month\" in time_type:\n",
    "                mult = 1.0\n",
    "            if \"year\" in time_type:\n",
    "                mult = 12.0\n",
    "            months_age.append(int(amount) * mult)\n",
    "    data_frame[\"Age\"] = pd.DataFrame(months_age).astype(np.float32)\n",
    "    data_frame.drop(columns=[\"AgeuponOutcome\"], inplace=True)\n",
    "    return data_frame\n",
    "\n",
    "\n",
    "train_set = change_age_format(train_set)\n",
    "val_set = change_age_format(val_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def change_sex_format(data_frame):\n",
    "    sex_neutered = data_frame[\"SexuponOutcome\"]\n",
    "    sex = []\n",
    "    neutered = []\n",
    "    for val in sex_neutered:\n",
    "        if pd.isnull(val):\n",
    "            sex.append(val)\n",
    "            neutered.append(val)\n",
    "        elif \"Unknown\" in val:\n",
    "            sex.append(np.nan)\n",
    "            neutered.append(np.nan)\n",
    "        else:\n",
    "            n, s = val.split(\" \")\n",
    "            if n in [\"Neutered\", \"Spayed\"]:\n",
    "                neutered.append(\"Yes\")\n",
    "            else:\n",
    "                neutered.append(\"No\")\n",
    "            sex.append(s)\n",
    "\n",
    "    data_frame[\"Sex\"] = pd.DataFrame(sex)\n",
    "    data_frame[\"Neutered\"] = pd.DataFrame(neutered)\n",
    "    data_frame.drop(columns=[\"SexuponOutcome\"], inplace=True)\n",
    "    return data_frame\n",
    "\n",
    "\n",
    "train_set = change_sex_format(train_set)\n",
    "val_set = change_sex_format(val_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove irrelevant columns\n",
    "def remove_columns(data_frame, list_columns_names=None):\n",
    "    if list_columns_names is not None:\n",
    "        data_frame.drop(columns=list_columns_names, inplace=True)\n",
    "    return data_frame\n",
    "\n",
    "\n",
    "train_set = remove_columns(train_set, [\"Name\", \"OutcomeSubtype\", \"AnimalID\"])\n",
    "val_set = remove_columns(val_set, [\"Name\", \"OutcomeSubtype\", \"AnimalID\"])\n",
    "\n",
    "logger.report_table(\n",
    "    title=\"Trainset - after preprocessing\",\n",
    "    series=\"pandas DataFrame\",\n",
    "    iteration=0,\n",
    "    table_plot=train_set.head(),\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## *Fill NA Values*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Snapshot the training set's object-dtype and numeric columns; the cells below\n",
    "# use these copies to decide which columns to fill and which to encode.\n",
    "object_columns = train_set.select_dtypes(include=[\"object\"]).copy()\n",
    "numerical_columns = train_set.select_dtypes(include=[\"number\"]).copy()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Notice that the configuration dictionary is read below to get the value of `fill_categorical_NA`. This value can be overridden by the pipeline controller. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if configuration_dict.get(\"fill_categorical_NA\", True):\n",
    "    # Fill missing categorical values with the most frequent value seen in the\n",
    "    # training set; the same value is applied to the validation set.\n",
    "    for col in object_columns.columns:\n",
    "        if object_columns[col].isnull().sum() == 0:\n",
    "            continue\n",
    "        # Count only real values: a missing marker must never win the vote,\n",
    "        # otherwise the fill below would do nothing.\n",
    "        top = Counter(object_columns[col].dropna()).most_common(1)\n",
    "        if not top:\n",
    "            # Column is entirely missing in the training set - nothing to fill with\n",
    "            continue\n",
    "        most_common = top[0][0]\n",
    "        print(\n",
    "            'Column \"{}\": replacing null values with \"{}\"'.format(col, most_common)\n",
    "        )\n",
    "        # Assign the result back instead of an in-place fill on `frame[col]`,\n",
    "        # which may act on a temporary copy and leave the frame unchanged.\n",
    "        train_set[col] = train_set[col].fillna(most_common)\n",
    "        val_set[col] = val_set[col].fillna(most_common)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Notice that the configuration dictionary is read below to get the value of `fill_numerical_NA`. This value can be overridden by the pipeline controller. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if configuration_dict.get(\"fill_numerical_NA\", True):\n",
    "    # Fill missing numerical values with the training-set median of the column;\n",
    "    # the same value is applied to the validation set.\n",
    "    for col in numerical_columns.columns:\n",
    "        if numerical_columns[col].isnull().sum() == 0:\n",
    "            continue\n",
    "        median_val = numerical_columns[col].median()\n",
    "        if pd.isnull(median_val):\n",
    "            # Column is entirely missing in the training set - nothing to fill with\n",
    "            continue\n",
    "        print(\n",
    "            'Column \"{}\": replacing null values with \"{}\"'.format(col, median_val)\n",
    "        )\n",
    "        # Assign the result back instead of an in-place fill on `frame[col]`,\n",
    "        # which may act on a temporary copy and leave the frame unchanged.\n",
    "        train_set[col] = train_set[col].fillna(median_val)\n",
    "        val_set[col] = val_set[col].fillna(median_val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop rows that still contain NA values (i.e. those that were chosen not to be filled)\n",
    "for split in (train_set, val_set):\n",
    "    split.dropna(inplace=True)\n",
    "\n",
    "# Report the table only when at least one of the fill options was enabled\n",
    "filled_something = any(\n",
    "    configuration_dict.get(option, True)\n",
    "    for option in (\"fill_categorical_NA\", \"fill_numerical_NA\")\n",
    ")\n",
    "if filled_something:\n",
    "    logger.report_table(\n",
    "        title=\"Trainset - after filling missing values\",\n",
    "        series=\"pandas DataFrame\",\n",
    "        iteration=0,\n",
    "        table_plot=train_set.head(),\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## *Labels Encoding*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stack both splits into one frame\n",
    "all_data = pd.concat([train_set, val_set])\n",
    "\n",
    "# Map each outcome label to its position in the column's category index\n",
    "outcome_categories = all_data[\"OutcomeType\"].astype(\"category\").cat.categories\n",
    "outcome_dict = dict(zip(outcome_categories, range(len(outcome_categories))))\n",
    "task.upload_artifact(\"Outcome dictionary\", outcome_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace every object-dtype column with its integer category codes\n",
    "for col in object_columns.columns:\n",
    "    encoded = all_data[col].astype(\"category\")\n",
    "    all_data[col] = encoded.cat.codes\n",
    "\n",
    "# Split the encoded frame back into its two parts by row position\n",
    "n_train = len(train_set.index)\n",
    "train_set = all_data.iloc[:n_train, :]\n",
    "val_set = all_data.iloc[n_train:, :]\n",
    "\n",
    "logger.report_table(\n",
    "    title=\"Trainset - after labels encoding\",\n",
    "    series=\"pandas DataFrame\",\n",
    "    iteration=0,\n",
    "    table_plot=train_set.head(),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# making all variables categorical\n",
    "object_columns_names = object_columns.drop(columns=[\"OutcomeType\"]).columns\n",
    "for col in object_columns_names:\n",
    "    all_data[col] = all_data[col].astype(\"category\")\n",
    "\n",
    "# Record how many categories each of those columns ended up with\n",
    "columns_categories = {}\n",
    "for col in object_columns_names:\n",
    "    columns_categories[col] = len(all_data[col].cat.categories)\n",
    "task.upload_artifact(\"Categories per column\", columns_categories)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Publish the processed splits as artifacts of this task\n",
    "for artifact_name, split in ((\"train_data\", train_set), (\"val_data\", val_set)):\n",
    "    task.upload_artifact(artifact_name, artifact_object=split)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
